{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# !pip3.8 uninstall trl -y\n",
    "# !pip3.8 install git+https://github.com/malaysia-ai/trl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-08-19 15:18:55.700269: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-08-19 15:18:55.867081: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2023-08-19 15:18:56.663426: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
      "2023-08-19 15:18:56.663499: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
      "2023-08-19 15:18:56.663502: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
     ]
    }
   ],
   "source": [
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    AutoTokenizer,\n",
    "    BitsAndBytesConfig,\n",
    "    HfArgumentParser,\n",
    "    AutoTokenizer,\n",
    "    TrainingArguments,\n",
    ")\n",
    "\n",
    "from trl import SFTTrainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The Llama-2 tokenizer is implemented inside transformers itself, so there is no\n",
    "# need to opt in to executing code from the model repository (`trust_remote_code`).\n",
    "tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')\n",
    "# Pad with the unknown token, and pad on the right-hand side of each sequence.\n",
    "tokenizer.pad_token = tokenizer.unk_token\n",
    "tokenizer.padding_side = \"right\"\n",
    "# Length (in tokens) of every packed training block built further down.\n",
    "max_seq_length = 2048"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'</s>'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.pad_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -n 1000 shuf-combine-1536.jsonl > example.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from trl.trainer.utils import ConstantLengthDataset\n",
    "from datasets import load_dataset\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'</s>'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-e556c340d5480304\n",
      "Found cached dataset json (/home/husein/.cache/huggingface/datasets/json/default-e556c340d5480304/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5d91283f6ef84f459eae92481b8c20d9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1000 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "system_prompts = [\n",
    "    'You are an AI assistant',\n",
    "    'Anda adalah pembantu AI',\n",
    "    'Anda adalah pembantu AI yang berguna',\n",
    "    'You are a helpful assistant',\n",
    "    'Anda seorang pembantu yang berguna',\n",
    "]\n",
    "\n",
    "\n",
    "def generate_and_tokenize_prompt(row):\n",
    "    \"\"\"Render one dataset row as a Llama-2 chat prompt and tokenize it.\n",
    "\n",
    "    A row is either a single `input`/`output` pair or, when `output` is None and\n",
    "    `input` contains `<bot>:`, a transcript of alternating `<manusia>:` (human)\n",
    "    and `<bot>:` segments that is split into (human, bot) turns.\n",
    "\n",
    "    Returns the result of calling `tokenizer` on the rendered text.\n",
    "    \"\"\"\n",
    "    system_prompt = random.choice(system_prompts)\n",
    "\n",
    "    if '<bot>:' in row['input'] and row['output'] is None:\n",
    "        segments = row['input'].split('<bot>:')\n",
    "        turns = []\n",
    "        for i in range(len(segments) - 1):\n",
    "            if i == 0:\n",
    "                human = segments[i].replace('<manusia>:', '')\n",
    "            else:\n",
    "                human = segments[i].split('<manusia>:')[1]\n",
    "            # The bot reply is whatever precedes the next human marker.\n",
    "            bot = segments[i + 1].split('<manusia>:')[0]\n",
    "            turns.append((human, bot))\n",
    "    else:\n",
    "        turns = [(row['input'], row['output'])]\n",
    "\n",
    "    # Put ` </s><s>[INST] ` only BETWEEN turns and close the last turn with ` </s>`.\n",
    "    # Appending the separator after every turn left a dangling `<s>[INST] ` at the\n",
    "    # end of each example, which then ran straight into the next packed example.\n",
    "    dialogue = ' </s><s>[INST] '.join(f'{human} [/INST] {bot}' for human, bot in turns)\n",
    "    text = f'<s>[INST] <<SYS>>\\n{system_prompt}\\n<</SYS>>\\n\\n{dialogue} </s>'\n",
    "\n",
    "    # The template already spells out `<s>` and `</s>`; letting the tokenizer add its\n",
    "    # own special tokens as well produced a doubled `<s><s>` at the start of each example.\n",
    "    return tokenizer(text, add_special_tokens=False)\n",
    "\n",
    "\n",
    "# Load the sampled JSONL file, tokenize every row, then drop the raw text columns\n",
    "# so that only the tokenizer outputs remain.\n",
    "dataset = (\n",
    "    load_dataset(\"json\", data_files='example.jsonl', split=\"train\")\n",
    "    .map(generate_and_tokenize_prompt)\n",
    "    .remove_columns(['prompt_input', 'input', 'output'])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input_ids', 'attention_mask'],\n",
       "    num_rows: 1000\n",
       "})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1c48fcb2a8f64457b163872b6ab6818c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from itertools import chain\n",
    "\n",
    "def group_texts(examples, block_size=None):\n",
    "    \"\"\"Concatenate every column of a batch and cut it into fixed-length blocks.\n",
    "\n",
    "    Args:\n",
    "        examples: mapping of column name -> list of per-row token lists.\n",
    "        block_size: number of tokens per block. When None, the notebook-level\n",
    "            `max_seq_length` is used.\n",
    "\n",
    "    Returns:\n",
    "        A dict with the same columns, each split into lists of `block_size`\n",
    "        tokens, plus a `labels` column that is a copy of `input_ids`. Tokens\n",
    "        left over after the last full block are dropped.\n",
    "    \"\"\"\n",
    "    if block_size is None:\n",
    "        block_size = max_seq_length\n",
    "    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}\n",
    "    # The first column's flattened length decides how many full blocks fit.\n",
    "    total_length = len(next(iter(concatenated_examples.values())))\n",
    "    total_length = (total_length // block_size) * block_size\n",
    "    result = {\n",
    "        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]\n",
    "        for k, t in concatenated_examples.items()\n",
    "    }\n",
    "    # Causal-LM training uses the inputs themselves as targets.\n",
    "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
    "    return result\n",
    "\n",
    "lm_datasets = dataset.map(\n",
    "    group_texts,\n",
    "    batched=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "class Dataset(torch.utils.data.Dataset):\n",
    "    \"\"\"Map-style torch dataset that forwards length and indexing to a wrapped object.\"\"\"\n",
    "\n",
    "    def __init__(self, lm_datasets):\n",
    "        # Only a reference is stored; nothing is copied or transformed here.\n",
    "        self.lm_datasets = lm_datasets\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Return the number of items reported by the wrapped object.\"\"\"\n",
    "        wrapped = self.lm_datasets\n",
    "        return len(wrapped)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        \"\"\"Return item `idx` of the wrapped object unchanged.\"\"\"\n",
    "        wrapped = self.lm_datasets\n",
    "        return wrapped[idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "357"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clm_dataset = Dataset(lm_datasets)\n",
    "len(clm_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "isinstance(clm_dataset, (torch.utils.data.IterableDataset, torch.utils.data.Dataset, ConstantLengthDataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DataCollatorForLanguageModeling\n",
    "\n",
    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset_text_field = 'text'\n",
    "# use_formatting_func = None\n",
    "# max_seq_len = max_seq_length\n",
    "# def tokenize(element):\n",
    "#     outputs = tokenizer(\n",
    "#         element[dataset_text_field] if not use_formatting_func else formatting_func(element),\n",
    "#         truncation=True,\n",
    "#         padding=False,\n",
    "#         max_length=max_seq_len,\n",
    "#         return_overflowing_tokens=False,\n",
    "#         return_length=False,\n",
    "#     )\n",
    "\n",
    "#     if use_formatting_func and not self._dataset_sanity_checked:\n",
    "#         if not isinstance(formatting_func(element), list):\n",
    "#             raise ValueError(\n",
    "#                 \"The `formatting_func` should return a list of processed strings since it can lead to silent bugs.\"\n",
    "#             )\n",
    "#         else:\n",
    "#             self._dataset_sanity_checked = True\n",
    "\n",
    "#     return {\"input_ids\": outputs[\"input_ids\"], \"attention_mask\": outputs[\"attention_mask\"]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from trl.trainer.utils import ConstantLengthDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# constant = ConstantLengthDataset(\n",
    "#     tokenizer,\n",
    "#     dataset,\n",
    "#     dataset_text_field='text',\n",
    "#     formatting_func=None,\n",
    "#     seq_length=max_seq_length,\n",
    "#     infinite=True,\n",
    "#     num_of_sequences=1024,\n",
    "#     eos_token_id=tokenizer.eos_token_id,\n",
    "# )\n",
    "# i = iter(constant)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "b = [clm_dataset[0], clm_dataset[1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    }
   ],
   "source": [
    "b = data_collator(b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([   1,    1,  518,  ..., 1590,  574,  348])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b['input_ids'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"<s><s> [INST] <<SYS>>\\nYou are an AI assistant\\n<</SYS>>\\n\\nterjemah `kaoz the whole world at marina wan to get out bt stuck` ke inggeris [/INST] Chaos! The whole world at Marina wants to get out but they're stuck. </s><s> [INST] <s><s> [INST] <<SYS>>\\nAnda seorang pembantu yang berguna\\n<</SYS>>\\n\\n Hasilkan penerangan bertulis terperinci tentang pemandangan yang mempesonakan di luar dataran bandar dengan bunga, bakul, muzik dan seruling.\\n [/INST]  Beberapa bunian muda bergerak melalui orang ramai, menjaja bunga yang mereka bawa dalam bakul tenunan yang besar. Yang lain duduk dalam kumpulan kecil di tepi jalan, memainkan lagu-lagu lembut pada tambor dan seruling dan bergoyang mengikut masa dengan muzik. Ramai orang yang lalu lalang memandang ke atas dengan rasa curiga atau jijik semasa mereka tergesa-gesa dalam perjalanan. </s><s> [INST] <s><s> [INST] <<SYS>>\\nAnda adalah pembantu AI yang berguna\\n<</SYS>>\\n\\n Apakah tanda dagangan Eddie Barclay?\\n [/INST]  Tanda dagangan Eddie Barclay ialah sut putih, dan pesta Saint-Tropeznya di mana semua tetamu memakai pakaian putih menjadi acara besar untuk media Perancis. </s><s> [INST] <s><s> [INST] <<SYS>>\\nYou are a helpful assistant\\n<</SYS>>\\n\\n Projek A memerlukan 2 pembangun backend magento sepenuh masa dengan tarikh akhir pada 10 April. Projek B memerlukan 1 pembangun backend magento sepenuh masa dan 1 sambilan dengan tarikh akhir pada 11 Mei. Kedua-dua projek memerlukan 1 bulan 2 pembangun bahagian belakang magento berdedikasi sepenuh masa. Bagaimanakah sumber ini boleh digunakan untuk mencapai tarikh akhir kedua-duanya?\\n [/INST]  Untuk menyiapkan kedua-dua projek tepat pada masanya dan memenuhi tarikh akhir masing-masing, anda boleh memperuntukkan sumber dengan cara berikut:\\n\\n1. Mulakan dengan menugaskan 2 pembangun bahagian belakang Magento sepenuh masa kepada Projek A. Memandangkan ia mempunyai tarikh akhir pada 10 April, ia mengambil keutamaan berbanding Projek B.\\n2. 
Dari 16 Mac hingga 10 April (25 hari), 2 pembangun sepenuh masa ini akan mengusahakan Projek A. Dengan mengandaikan cuti hujung minggu, ini bermakna mereka akan mempunyai lebih kurang 18 hari bekerja.\\n3. Setelah Projek A selesai pada 10 April, anda boleh memperuntukkan sumber kepada Projek B. Berikan 1 pembangun sepenuh masa dan 1 pemaju sambilan kepada Projek B.\\n4. Dari 11 April hingga 11 Mei (30 hari), pembangun ini akan mengusahakan Projek B. Dengan cuti hujung minggu, terdapat lebih kurang 21 hari bekerja.\\n5. Pembangun sepenuh masa akan mengusahakan Projek B untuk keseluruhan tempoh (21 hari bekerja). Pembangun sambilan boleh bekerja setengah hari, dengan berkesan menyediakan 10.5 hari bekerja tambahan (dengan mengandaikan kerja setengah hari bersamaan dengan 0.5 hari penuh).\\n6. Secara keseluruhan, Projek B akan menerima bersamaan 1 bulan 2 pemaju sepenuh masa: 21 hari (1 sepenuh masa) + 10.5 hari (1 sambilan) = 31.5 hari.\\n\\nDengan mengikuti pelan peruntukan sumber ini, anda boleh menyelesaikan kedua-dua projek dalam tarikh akhir yang diberikan.\\n </s><s> [INST]  Tetapi bagaimana jika pemaju sambilan mengambil cuti 2 minggu?\\n [/INST]  Jika pembangun sambilan mengambil cuti 2 minggu, anda perlu melaraskan peruntukan sumber untuk memastikan kedua-dua projek masih boleh disiapkan dalam tarikh akhir masing-masing.\\n\\n1. Seperti sebelum ini, tetapkan 2 pembangun bahagian belakang Magento sepenuh masa kepada Projek A dari 16 Mac hingga 10 April (25 hari, kira-kira 18 hari bekerja).\\n2. Setelah Projek A siap pada 10 April, anda boleh memperuntukkan sumber kepada Projek B. Berikan 1 pembangun sepenuh masa dan 1 pemaju sambilan kepada Projek B.\\n3. Pembangun sepenuh masa akan mengusahakan Projek B untuk keseluruhan tempoh dari 11 April hingga 11 Mei (30 hari, kira-kira 21 hari bekerja).\\n4. Pemaju sambilan, bagaimanapun, hanya akan tersedia untuk separuh daripada hari bekerja kerana cuti 2 minggu. 
Jika cuti hujung minggu, cuti 2 minggu terdiri daripada 10 hari bekerja. Oleh itu, pemaju sambilan akan tersedia selama kira-kira 11 hari bekerja (21 - 10) semasa Projek B.\\n5. Memandangkan pembangun separuh masa bekerja setengah hari, mereka akan menyumbang 5.5 hari (11 \\\\* 0.5) kerja sepenuh masa yang setara kepada Projek B.\\n6. Secara keseluruhan, Projek B akan menerima 26.5 hari kerja setara sepenuh masa (21 hari daripada pemaju sepenuh masa + 5.5 hari daripada pemaju sambilan).\\n\\nUntuk mengimbangi pengurangan sumbangan daripada pemaju sambilan dan masih memenuhi tarikh akhir untuk Projek B, anda boleh mempertimbangkan pilihan berikut:\\n\\nPilihan 1: Lanjutkan waktu bekerja untuk pembangun sepenuh masa di Projek B untuk menampung waktu kerja yang hilang. Pemaju sepenuh masa mungkin perlu bekerja lebih masa untuk memastikan projek disiapkan tepat pada masanya.\\n\\nPilihan 2: Jika boleh, ambil pemaju sementara untuk mengisi jurang yang dibuat oleh cuti pemaju sambilan. Pembangun\""
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(b['input_ids'][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
